In [1]:
import os
import io
import math
import random
import requests
from tqdm import tqdm
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def download_file(url, filename):
    r = requests.get(url, stream=True)
    total_size = int(r.headers.get('content-length', 0))
    block_size = 1024
    total_kb_size = math.ceil(total_size / block_size)
    wrote = 0
    with open(filename, 'wb') as f:
        for data in tqdm(r.iter_content(block_size), total=total_kb_size, unit='KB', unit_scale=True):
            wrote += len(data)
            f.write(data)
In [3]:
datafile = "eth-eur.csv"
#download from the server if not already present
if not os.path.exists(datafile):
    download_file("https://www.coingecko.com/price_charts/export/279/eur.csv", datafile)
In [4]:
data = pd.read_csv(datafile)
#print a random sample
data.iloc[random.randint(0, data.shape[0] - 1)]
Out[4]:
In [5]:
data.info()
Here we can see that every sample is defined by the day (in date format), the price on that day, the market cap and the total volume of transactions done that day.
At first glance they all look like reasonable indicators, so every one of them will be used as a feature.
In [6]:
#customize the index: keep only the date part of the timestamp
data.snapped_at[0].split()[0]  # e.g. '2015-08-07 00:00:00' -> '2015-08-07'
data.snapped_at = data.snapped_at.apply(lambda x: x.split()[0])
In [7]:
data.set_index('snapped_at', inplace=True)
data.index = pd.to_datetime(data.index)
In [8]:
features = ['price', 'market_cap', 'total_volume']
In [9]:
data[features].plot(subplots=True, layout=(1,3), figsize=(20,4));
In [10]:
data.iloc[0:10]
Out[10]:
The list is not complete (2015-08-09 is missing), so we have to fill in the blanks.
In [11]:
#check
'2015-08-09 00:00:00' in data.index
Out[11]:
In [12]:
#Generate all the possible days and use them to reindex
start = data.index.min()
end = data.index.max()
index_complete = pd.date_range(start, end)
data = data.reindex(index_complete)
In [13]:
#Fill each gap with the mean of the previous day and the day after
for idx in data.index:
    dayloc = data.index.get_loc(idx)
    day = data.loc[idx]
    if day.hasnans:
        #.mean() skips NaN, so this averages the two neighbouring days
        rg = slice(dayloc - 1, dayloc + 2)
        data.loc[idx] = data.iloc[rg].mean()
        print("Day <{}> updated".format(idx))
In [14]:
#check
data.loc['2015-08-09 00:00:00']
Out[14]:
In [15]:
#Check whether there are NaNs anywhere else
data[data.isnull().any(axis=1)].count()
Out[15]:
Now we need to add a new feature holding the closing price of every sample. The Ethereum market is always open, so we can ignore weekends and directly use the opening price of the next sample.
Later on, the model will use this feature as the target, since it is the value we are trying to predict.
The following script will help us with that.
In [16]:
new_column = 'closed_price'
datab = data.copy()
nc = list()
for idx in data.index:
    dayloc = data.index.get_loc(idx)
    #use the next day's price as the closing price
    if dayloc == len(data.index) - 1:
        #the last position has no known closed_price yet
        closed_price = np.nan
    else:
        closed_price = data.iloc[dayloc + 1].price
    nc.append(closed_price)
data[new_column] = nc
data.tail(5)
Out[16]:
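For reference, the whole loop above collapses to a single pandas shift (an equivalent sketch, assuming the index is sorted chronologically):
data['closed_price'] = data.price.shift(-1)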
In [17]:
#Drop the last row because its closing price is still unknown
data = data.drop(data.index[-1])
In [18]:
#X_train, X_test, y_train, y_test = train_test_split(data[features],
# data.closed_price,
# test_size=0.20,
# shuffle=False,
# random_state=42)
#90% for training, keeping the chronological order
split = round(len(data)*0.9)
data_train, data_test = data[:split].copy(), data[split:].copy()
In [19]:
print("Size data_train: {}".format(data_train.shape[0]))
print("Size data_test: {}".format(data_test.shape[0]))
Be careful here: we don't know whether future values will stay inside the range seen so far. For this reason we'll fit the scaler using only the training data, NOT the testing data.
Standardization is a well-known normalization technique that centres the data and divides by the standard deviation; it is especially robust when new values fall outside the expected range.
*Note: this method assumes the data roughly follows a Gaussian distribution.
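Concretely, the transform is z = (x - mean) / std, with both statistics computed on the training split only. A minimal sketch of what StandardScaler does (note it uses the population standard deviation, ddof=0):
mu = data_train[data.columns].mean()
sigma = data_train[data.columns].std(ddof=0)
z_test = (data_test[data.columns] - mu) / sigma  #same result as scaler.transform below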
In [20]:
#Scale the data
scaler = StandardScaler()
data_train_norm, data_test_norm = data_train.copy(), data_test.copy()
data_train_norm[data.columns] = scaler.fit_transform(data_train[data.columns])
data_test_norm[data.columns] = scaler.transform(data_test[data.columns])
data_test_norm.describe()
Out[20]:
In [21]:
from distutils.version import LooseVersion
import warnings
import tensorflow as tf
# Check TensorFlow Version
assert LooseVersion(tf.__version__) >= LooseVersion('1.0'), 'Please use TensorFlow version 1.0 or newer'
print('TensorFlow Version: {}'.format(tf.__version__))
# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
In [22]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
#reshape to (samples, timesteps, features) with a single timestep
X_train = data_train_norm[features].values.reshape((data_train_norm.shape[0], 1, 3))
y_train = data_train_norm.closed_price.values
X_test = data_test_norm[features].values.reshape((data_test_norm.shape[0], 1, 3))
y_test = data_test_norm.closed_price.values
In [23]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
In [24]:
model = Sequential()
model.add(LSTM(32, input_shape=(1, 3)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
Out[24]:
In [25]:
print("Training) R^2 score: {:.3f}".format(r2_score(y_train, model.predict(X_train))))
print("Testing) R^2 score: {:.3f}".format(r2_score(y_test, model.predict(X_test))))
In [26]:
pred = model.predict(X_train)
plt.plot(y_train, label='Actual')
plt.plot(pred, label='Prediction')
plt.legend()
Out[26]:
In [27]:
#saving
model_1_3 = model
In [28]:
def prepare_sequence(data, sequence_size=7):
    '''
    Helper function to transform the dataset into
    sequences of shape (samples, sequence_size, features).
    '''
    sequence = []
    buckets = data.shape[0]//sequence_size
    #drop the leading rows that don't fill a whole bucket
    init_sample = data.shape[0] - buckets*sequence_size
    samples = 0
    for i in range(init_sample, data.shape[0] - sequence_size + 1):
        #overlapping windows: each starts one row after the previous
        sequence.append(data[i:i+sequence_size])
        samples += 1
    return np.concatenate(sequence).reshape((samples, sequence_size, data.shape[1]))
prepare_sequence(data[features]).shape
Out[28]:
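A quick property check of the helper (a sketch): consecutive sequences overlap in all but one row, so each day can appear in up to seven windows.
seqs = prepare_sequence(data[features])
assert np.array_equal(seqs[0][1:], seqs[1][:-1])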
In [29]:
#getting (samples, steps, features)
X_train = prepare_sequence(data_train_norm[features])
X_test = prepare_sequence(data_test_norm[features])
#each target is the closing price of the last day in its sequence
y_train = data_train_norm.iloc[-len(X_train):].closed_price.values
y_test = data_test_norm.iloc[-len(X_test):].closed_price.values
In [30]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
In [31]:
model = Sequential()
model.add(LSTM(32, input_shape=(7, 3)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
Out[31]:
In [32]:
print("Training) R^2 score: {:.3f}".format(r2_score(y_train, model.predict(X_train))))
print("Testing) R^2 score: {:.3f}".format(r2_score(y_test, model.predict(X_test))))
In [33]:
pred = model.predict(X_train)
plt.plot(y_train, label='Actual')
plt.plot(pred, label='Prediction')
plt.legend()
Out[33]:
In [34]:
#saving
model_7_3 = model
The neural network is not able to make good predictions for data it has never seen before, which is why some days are poorly fitted. This problem is caused by 'out-of-scale' inputs: the test set contains values far beyond the training range.
Thinking of the batch size as a window of days that defines how the neural network learns, one idea is to normalize each window by its last sample. This way we can keep almost all the data on the same scale.
In [35]:
def print_mean_std(data):
    mean = np.mean(data)
    std = np.std(data)
    print("mean:{:.3f} std:{:.3f}".format(mean, std))
In [36]:
def window_normalization(data, window_size):
    '''Scale each window of samples by the absolute value of its last sample.'''
    y = np.empty_like(data, dtype='float64')
    normalizer = list()
    for i in range(0, len(data), window_size):
        j = min(i+window_size, len(data))
        y[i:j] = data[i:j]/np.abs(data[j-1])
        normalizer.append(np.abs(data[j-1]))
        #print_mean_std(y[i:j])
    return y, normalizer

def window_denormalization(norm_data, normalizer, window_size):
    '''Invert window_normalization using the stored scale factors.'''
    y = np.empty_like(norm_data, dtype='float64')
    idx = 0
    for i in range(0, len(norm_data), window_size):
        j = min(i+window_size, len(norm_data))
        y[i:j] = norm_data[i:j]*normalizer[idx]
        idx += 1
    return y
In [37]:
#testing the functions
a = np.array([[1, 1, 1], [2, 2, 2], [2, 2, 2], [8, 8, 8]])
expected_result = np.array([[0.5, 0.5, 0.5], [1, 1, 1], [0.25, 0.25, 0.25], [1, 1, 1]])
norm_a, normalizer = window_normalization(a, 2)
assert np.array_equal(norm_a, expected_result)
assert np.array_equal(a, window_denormalization(norm_a, normalizer, 2))
In [38]:
#Showing the last sample
data.index[-1].strftime("%d-%m-%Y")
Out[38]:
In [39]:
window_size = 32
X_train = data_train[features].values
y_train = data_train.closed_price.values
X_train_norm, _ = window_normalization(X_train, window_size)
y_train_norm, y_normalizer = window_normalization(y_train, window_size)
#getting (samples, steps, features)
X_train_norm = prepare_sequence(X_train_norm)
y_train_norm = y_train_norm[-len(X_train_norm):]
In [40]:
model = Sequential()
model.add(LSTM(32, input_shape=(7, 3)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train_norm, y_train_norm, epochs=50, batch_size=window_size, verbose=0)
Out[40]:
In [41]:
X_test = data_test[features].values
y_test = data_test.closed_price.values
X_test_norm, _ = window_normalization(X_test, window_size)
y_test_norm, y_scaler = window_normalization(y_test, window_size)
#getting (samples, steps, features)
X_test_norm = prepare_sequence(X_test_norm)
y_test_norm = y_test_norm[-len(X_test_norm):]
In [42]:
print("Training) R^2 score: {:.3f}".format(r2_score(y_train_norm, model.predict(X_train_norm))))
print("Testing) R^2 score: {:.3f}".format(r2_score(y_test_norm, model.predict(X_test_norm))))
In [43]:
pred = model.predict(X_train_norm)
plt.plot(y_train_norm, label='Actual')
plt.plot(pred, label='Prediction')
plt.legend()
Out[43]:
In [44]:
#saving
model_win = model
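Since model_win was trained on window-normalized targets, its predictions can be mapped back to EUR with the stored scale factors. A sketch: `scales` repeats each per-window factor once per sample, and `offset` accounts for the rows dropped by prepare_sequence.
scales = np.concatenate([[s]*window_size for s in y_scaler])[:len(y_test)]
offset = len(y_test) - len(X_test_norm)
pred_eur = model_win.predict(X_test_norm).flatten() * scales[offset:]
plt.plot(y_test[offset:], label='Actual (EUR)')
plt.plot(pred_eur, label='Prediction (EUR)')
plt.legend()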
In [45]:
#evaluate the (7, 3) model on the standardized test set
X_test = prepare_sequence(data_test_norm[features])
y_test = data_test_norm.iloc[-len(X_test):].closed_price.values
pred = model_7_3.predict(X_test)
plt.plot(y_test, label='Actual')
plt.plot(pred, label='Prediction')
plt.legend()
Out[45]: